#coding=utf-8
import optparse
import sys
import time
import jieba
from pyhlseg import *


def showProcess(start_time, done_count, length):
	"""Write a one-line, in-place (``\\r``) progress report to stdout.

	start_time -- epoch seconds when processing began (as from time.time())
	done_count -- number of lines processed so far
	length     -- total number of characters processed so far
	"""
	curTime = time.time()
	tm_cost = curTime - start_time
	# Guard against division by zero / negative-clock artifacts when called
	# immediately after start (previously this could raise ZeroDivisionError).
	if tm_cost <= 0:
		tm_cost = 1e-6
	sys.stdout.write(u'\rtime:%d  count:%d  length:%d  speed:%.1f/秒 - %.1f万字/秒'
					 % (tm_cost, done_count, length, float(done_count)/tm_cost, float(length)/tm_cost/10000.0))
	# Flush explicitly: the \r line never contains '\n', so a block-buffered
	# stdout would otherwise show nothing until the program exits.
	sys.stdout.flush()

def load_hlseg():
	"""Start the Hylanda segmenter: boot its JVM, load the built-in user
	dictionary, and select the LARGE grain size.

	The call order matters: the JVM must be running before the dictionary
	is loaded or options are set. Pair with unload_hlseg() on shutdown.
	"""
	HylandaSegment.start_jvm()
	HylandaSegment.load_dictionary(user_dict_path=HylandaSegment.BUILD_IN_USER_DICT)
	HylandaSegment.set_option(grain_size=GrainSize.LARGE)
	# If the multi_grain_size option is set to True, results at other grain
	# sizes can later be obtained via change_result_grain_size():
	#HylandaSegment.set_option(grain_size=GrainSize.LARGE, multi_grain_size=True)

def unload_hlseg():
	"""Shut down the Hylanda segmenter's JVM (counterpart of load_hlseg())."""
	HylandaSegment.shutdown_jvm()

if __name__ == "__main__":
	# Command-line parsing
	usage = "usage: %prog [options] corpus\nexample: %prog /data/test_corpus.txt"
	parser = optparse.OptionParser(usage, version="%prog 0.1.0")
	parser.add_option("-o", "--output", action="store", type="string", dest="output_file", metavar="FILE", help='save result to FILE')
	parser.add_option("-j", action="store_true", dest="test_jieba", default=False, help='use jieba for segment')
	(options, args) = parser.parse_args()

	# Argument check: exactly one corpus path is required.
	if len(args) != 1:
		parser.print_help()
		sys.exit(1)

	if not options.test_jieba:
		# Load the Hylanda segmenter (starts a JVM; shut down at the end).
		load_hlseg()
	startTime = time.time()
	read_count, length = 0, 0
	file_result = None
	try:
		# Stream the corpus line by line so arbitrarily large files work.
		with open(args[0], 'r', encoding='utf-8') as corpus:
			if options.output_file is not None:
				file_result = open(options.output_file, "w", encoding='utf-8')
			for line in corpus:
				length += len(line)
				read_count += 1
				# Segment the current line.
				if not options.test_jieba:
					seg_result = HylandaSegment.segment(line)
					# Results are usually accessed via toString (words joined by
					# spaces), toStringArray, or toTokenArray. toTokenArray yields
					# Token objects whose main members are wordStr, type, subType,
					# natureFlag, userTag, etc.; see the Hylanda segmenter docs.
					# For example:
					# words = list(segmentor.segment(line).toStringArray())
					# tokens = list(segmentor.segment(line).toTokenArray())
					# seg_to_words() returns a list of Word objects instead.
					seg_text = str(seg_result.toString())
					# If the multi_grain_size option was set to True, results at
					# other grain sizes can be obtained like this:
					#seg_result_m = HylandaSegment.change_result_grain_size(seg_result, GrainSize.NORMAL)
					#seg_text = str(seg_result_m.toString())
				else:
					seg_result = jieba.cut(line)
					seg_text = ' '.join(list(seg_result))
				if file_result is not None:
					file_result.write(seg_text)
				# Report progress every 10000 lines.
				if read_count % 10000 == 0:
					showProcess(startTime, read_count, length)
	finally:
		# BUGFIX: the output file was previously closed only on the success
		# path inside the `with` block, leaking the handle (and potentially
		# buffered output) if segmentation raised. Close it unconditionally.
		if file_result is not None:
			file_result.close()
	showProcess(startTime, read_count, length)
	sys.stdout.write('\n')  # terminate the \r-based progress line
	if not options.test_jieba:
		# Unload the segmenter / shut down the JVM.
		unload_hlseg()


